Rating Prediction of Cafe on Google Maps¶
Datasets¶
In [1]:
import os
import json
import gzip
from functools import partial
from datetime import datetime, timezone
import re
from collections import defaultdict
import matplotlib.pyplot as plt
import folium
from folium.plugins import MarkerCluster
from shapely.geometry import shape, Point
from shapely.prepared import prep
from functools import lru_cache
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
import ast
import geopandas as gpd
from shapely.geometry import Point
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import r2_score
torch.manual_seed(0)
Out[1]:
<torch._C.Generator at 0x3094f9230>
Downloading Dataset¶
In [2]:
meta_path = "./datasets/raw/meta-California.json.gz"
meta_keys = ["gmap_id", "name", "latitude", "longitude", "category", "avg_rating", "num_of_reviews", "price", "hours"]
review_path = "./datasets/raw/review-California.json.gz"
review_keys = ["gmap_id", "user_id", "name", "time", "rating"]
total_reviews = 70529977
In [3]:
def download_meta_data():
url = "https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/meta-California.json.gz"
res = requests.get(url, stream=True)
with open(meta_path, "wb") as f:
f.write(res.content)
In [4]:
def download_review_data():
url = "https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/review-California.json.gz"
res = requests.get(url, stream=True)
with open(review_path, "wb") as f:
f.write(res.content)
In [5]:
os.makedirs("./datasets/raw", exist_ok=True)
os.makedirs("./datasets/processed", exist_ok=True)
if not os.path.exists(meta_path):
download_meta_data()
if not os.path.exists("./datasets/raw/review-California.json.gz"):
download_review_data()
Processing Dataset¶
In [6]:
def parse(path):
g = gzip.open(path, "r")
for l in g:
yield json.loads(l)
Processing business data to extract Cafes we want to focus on.
In [7]:
# I reused my code from COGS108 project to process dataset.
def get_cafe_categories():
categories = []
for business in parse(meta_path):
if business["category"] is not None:
categories += business["category"]
categories = np.array(categories)
unique = np.unique(categories)
cafe_categories = [str(category) for category in unique if "cafe" in category.lower() or "coffee" in category.lower()]
print(f"The number of categories containting 'cafe' substring is {len(cafe_categories)}")
print(cafe_categories)
with open(f"./datasets/processed/cafe_categories.txt", "w") as f:
f.write("\n".join(cafe_categories))
def filter_by_category(data, categories):
category = data.get("category", None)
if category is None:
return False
return len(set(category) & categories) != 0
def filter_by_num_reviews(data, min_num_reviews):
return data["num_of_reviews"] >= min_num_reviews
def filter_raw_business_data(filters):
businesses = []
for business in parse(meta_path):
if all([f(data=business) for f in filters]):
business = {key: business.get(key, None) for key in meta_keys}
businesses.append(business)
print(f"We obtained total of {len(businesses)} after filtering")
df = pd.DataFrame(businesses)
df.to_csv(f"./datasets/processed/cafes.csv", index=False)
In [8]:
if not os.path.exists("./datasets/processed/cafe_categories.txt"):
get_cafe_categories()
if not os.path.exists("./datasets/processed/cafes.csv"):
min_num_reviews = 100
with open("./datasets/processed/cafe_categories.txt", "r") as f:
cafe_categories = set(f.read().split("\n"))
cafe_filter = partial(filter_by_category, categories=cafe_categories)
num_reviews_filter = partial(filter_by_num_reviews, min_num_reviews=min_num_reviews)
filter_raw_business_data([cafe_filter, num_reviews_filter])
Processing review data to extract reviews we want to focus on.
In [9]:
# I reused my code from COGS108 project to process dataset.
def filter_by_gmap_id(data, gmap_ids):
gmap_id = data.get("gmap_id", None)
if gmap_id is None:
return False
return gmap_id in gmap_ids
def filter_raw_review_data(filters):
reviews = []
for review in tqdm.tqdm(parse(review_path), total=total_reviews):
if all([f(data=review) for f in filters]):
review = {key: review.get(key, None) for key in review_keys}
review["review_id"] = f"{review['user_id']}_{review['gmap_id']}"
reviews.append(review)
print(f"We obtained total of {len(reviews)} after filtering")
df = pd.DataFrame(reviews)
df.to_csv("./datasets/raw/cafe_reviews.csv", index=False)
def extract_user_ids(reviews, min_num_reviews):
user_ids = reviews["user_id"].dropna().values
unique, counts = np.unique(np.array(user_ids), return_counts=True)
users = pd.DataFrame({"user_id": unique, "num_reviews": counts})
users = users[users["num_reviews"] >= min_num_reviews].reset_index(drop=True)
print(f"We extracted {users.shape[0]} users after filtering.")
users.to_csv("./datasets/processed/users.csv", index=False)
def filter_by_user_ids(reviews, user_ids):
reviews = reviews[reviews["user_id"].isin(user_ids)]
print(f"We extracted {reviews.shape[0]} reviews after filtering.")
reviews.to_csv("./datasets/processed/reviews.csv", index=False)
In [10]:
if not os.path.exists("./datasets/raw/cafe_reviews.csv"):
gmap_ids = set(pd.read_csv("./datasets/processed/cafes.csv")["gmap_id"].values)
gmap_id_filter = partial(filter_by_gmap_id, gmap_ids=gmap_ids)
filter_raw_review_data([gmap_id_filter])
if not os.path.exists("./datasets/processed/users.csv"):
print("Start processing user data")
reviews = pd.read_csv("./datasets/raw/cafe_reviews.csv")
min_num_reviews = 20
extract_user_ids(reviews, min_num_reviews)
if not os.path.exists("./datasets/processed/reviews.csv"):
print("Start filtering review data")
reviews = pd.read_csv("./datasets/raw/cafe_reviews.csv")
user_ids = pd.read_csv("./datasets/processed/users.csv")["user_id"].values
filter_by_user_ids(reviews, user_ids)
Split dataset into train, validation, and test so that we can evaluate models with unseen data. However, due to the design of the model which relies on pre-defined list of user and cafes, we need to split randomly without stratifying based on users or cafes.
In [11]:
def split_reviews():
file_name = "./datasets/processed/reviews.csv"
reviews = pd.read_csv(file_name).sample(frac=1, random_state=42)
valid_size = int(reviews.shape[0] * 0.1)
test_size = int(reviews.shape[0] * 0.1)
valid_reviews = reviews.iloc[:valid_size].reset_index(drop=True)
test_reviews = reviews.iloc[valid_size: valid_size + test_size].reset_index(drop=True)
train_reviews = reviews.iloc[valid_size + test_size:].reset_index(drop=True)
print(f"train: {train_reviews.shape[0]} / valid: {valid_reviews.shape[0]} / test: {test_reviews.shape[0]}")
os.makedirs("./datasets/splits", exist_ok=True)
train_reviews.to_csv("./datasets/splits/train.csv", index=False)
valid_reviews.to_csv("./datasets/splits/valid.csv", index=False)
test_reviews.to_csv("./datasets/splits/test.csv", index=False)
In [12]:
if not os.path.exists("./datasets/splits/train.csv"):
split_reviews()
EDA¶
Overview¶
Review Time¶
In [13]:
# One Hot Encoding for Unix Time Weekday
def unix_weekday_to_onehot(time):
feature_weekday = [0]*7
day = datetime.fromtimestamp(time / 1000, tz=timezone.utc).weekday()
feature_weekday[day] = 1.
return feature_weekday
# One Hot Encoding for Unix Time Hour
def unix_hour_to_onehot(time):
feature_dayhour = [0]*24
hr = datetime.fromtimestamp(time / 1000, tz=timezone.utc).hour
feature_dayhour[hr] = 1.
return feature_dayhour
In [14]:
reviews = pd.read_csv("./datasets/processed/reviews.csv")
reviews_time = reviews.copy()
# making sure the rating is numeric
reviews_time["rating"] = pd.to_numeric(reviews_time["rating"], errors="coerce")
# converting Unix ms to datetime
reviews_time["timestamp"] = pd.to_datetime(
reviews_time["time"],
unit="ms"
)
reviews_time["date"] = reviews_time["timestamp"].dt.date
reviews_time["month"] = reviews_time["timestamp"].dt.to_period("M").dt.to_timestamp()
# computing average rating per month and number of reviews per month
time_stats_all = (
reviews_time
.dropna(subset=["rating"])
.groupby("month")
.agg(
avg_rating=("rating", "mean"),
num_reviews=("rating", "count")
)
.reset_index()
)
min_reviews = 100
ts_global = time_stats_all[
(time_stats_all["num_reviews"] >= min_reviews)
& (time_stats_all["month"] >= "2008-01-01")
].sort_values("month")
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax = axes[0]
ax.plot(ts_global["month"], ts_global["avg_rating"], marker="o", linestyle="-")
ax.set_xlabel("Review month")
ax.set_ylabel("Average rating")
ax.set_title("Average review rating over time")
ax.tick_params(axis="x", rotation=45)
ax.set_ylim(3.0, 5.0) # rating range 3–5
ax = axes[1]
ax.plot(ts_global["month"], ts_global["num_reviews"], marker="o", linestyle="-")
ax.set_xlabel("Review month")
ax.set_ylabel("# Reviews")
ax.set_title("Number of reviews per month")
ax.tick_params(axis="x", rotation=45)
fig.suptitle("Review Time", fontsize=14)
fig.tight_layout()
plt.show()
Review Period¶
In [15]:
# One Hot Encoding for Period
def unix_period_to_onehot(unix_ms):
"""
0 -> before 2016
1 -> 2016-2019
2 -> 2020 and later
"""
if pd.isna(unix_ms):
return np.nan
try:
t = int(unix_ms)
except (ValueError, TypeError):
return np.nan
b2016_ms = int(pd.Timestamp("2016-01-01").timestamp() * 1000)
b2020_ms = int(pd.Timestamp("2020-01-01").timestamp() * 1000)
if t < b2016_ms:
return [0, 0, 0]
elif t < b2020_ms:
return [0, 1., 0]
else:
return [0, 0, 1.]
In [16]:
reviews_time = reviews.copy()
reviews_time["rating"] = pd.to_numeric(reviews_time["rating"], errors="coerce")
reviews_time["timestamp"] = pd.to_datetime(reviews_time["time"], unit="ms")
reviews_time["month"] = reviews_time["timestamp"].dt.to_period("M").dt.to_timestamp()
time_stats_all = (
reviews_time
.dropna(subset=["rating"])
.groupby("month")
.agg(
avg_rating=("rating", "mean"),
num_reviews=("rating", "count")
)
.reset_index()
)
min_reviews = 100
boundary_2016 = pd.Timestamp("2016-01-01")
boundary_2020 = pd.Timestamp("2020-01-01")
# Label function for plotting
def label_period(ts):
if ts < boundary_2016:
return "pre-2016"
elif ts < boundary_2020:
return "2016-2019"
else:
return "2020+"
time_stats_period = time_stats_all.copy()
time_stats_period["period"] = time_stats_period["month"].apply(label_period)
fig, axes = plt.subplots(1, 3, figsize=(18, 4))
period_order = ["pre-2016", "2016-2019", "2020+"]
for ax, label in zip(axes, period_order):
ts = time_stats_period[time_stats_period["period"] == label].copy()
ts = ts[ts["num_reviews"] >= min_reviews]
if ts.empty:
ax.set_visible(False)
continue # in case the early period has no data
ts = ts.sort_values("month")
ax.plot(ts["month"], ts["avg_rating"], marker="o", linestyle="-")
ax.set_xlabel("Review month")
ax.set_ylabel("Average rating")
ax.set_title(f"Average review rating over time ({label})")
ax.tick_params(axis="x", rotation=45)
ax.set_ylim(3.0, 5.1) # focusing on reasonable ratings
fig.suptitle("Review Periods", fontsize=14)
fig.tight_layout()
plt.show()
Chain¶
In [17]:
def get_chains_dict(cafes):
names, counts = np.unique(cafes["name"], return_counts=True)
indices = np.argsort(counts)[::-1]
sorted_counts = counts[indices]
sorted_names = names[indices]
stems = defaultdict(int)
for name in sorted_names:
words = name.lower().strip().split()
for i in range(len(words)):
stems[" ".join(words[:(i+1)])] += 1
chains = {}
for name, count in zip(sorted_names, sorted_counts):
if count > 5:
chains[name] = 2
continue
words = name.lower().strip().split()
stem_matches = []
for i in range(len(words)):
stem_matches.append(stems[" ".join(words[:(i+1)])])
if len(stem_matches) == 1 or len(stem_matches) > 10:
chains[name] = 0
continue
if len(stem_matches) >= 2 and sum(stem_matches[1:]) < 10:
chains[name] = 0
continue
if len(stem_matches) >= 3 and sum(stem_matches[2:]) < 5:
chains[name] = 0
continue
chains[name] = 1
return chains
In [18]:
cafes = pd.read_csv("../datasets/processed/cafes.csv")
# Counting how many times each cafe name appears
name_counts = cafes["name"].value_counts()
cafes["chain_size"] = cafes["name"].map(name_counts)
CHAIN_SIZE_THRESHOLD = 10
def classify_chain(size):
if pd.isna(size) or size <= 1:
return "Non Chain"
elif size < CHAIN_SIZE_THRESHOLD:
return "Sub Chains"
else:
return "Chains"
cafes["chain_category"] = cafes["chain_size"].apply(classify_chain)
# Merge with reviews to get ratings for each category
df = reviews.merge(
cafes[["gmap_id", "chain_category"]],
on="gmap_id",
how="left"
)
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
chain_avg = (
df.dropna(subset=["rating", "chain_category"])
.groupby("chain_category")["rating"]
.mean()
.reindex(["Non Chain", "Sub Chains", "Chains"])
)
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(chain_avg.index, chain_avg.values)
ax.set_xlabel("Chain category")
ax.set_ylabel("Average review rating")
ax.set_title("Average Rating of Cafes by Non Chains, Sub Chains, and Chains")
plt.tight_layout()
plt.show()
Price¶
In [19]:
# One Hot Encoding for Price
def price_to_onehot(price):
feature_price = [0]*4
if price is not np.nan:
feature_price[len(price)-1] += 1.
return feature_price
In [20]:
cafes = pd.read_csv("../datasets/processed/cafes.csv")
def price_to_num(p):
if pd.isna(p):
return np.nan
p = str(p).strip()
if p == "" or p.lower() == "none":
return np.nan
n = p.count("$")
return n if n > 0 else np.nan
cafes["price_num"] = cafes["price"].apply(price_to_num)
df = reviews.merge(
cafes[["gmap_id", "name", "latitude", "longitude", "price_num", "avg_rating"]],
on="gmap_id",
how="left"
)
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
# Price vs rating - data prep
cafe_avg_price = df.groupby("gmap_id").agg(
avg_user_rating=("rating", "mean"),
price_num=("price_num", "first")
).dropna()
levels = sorted(cafe_avg_price["price_num"].unique())
means = cafe_avg_price.groupby("price_num")["avg_user_rating"].mean()
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Price vs rating
ax = axes[0]
ax.boxplot(
[cafe_avg_price[cafe_avg_price["price_num"] == k]["avg_user_rating"] for k in levels],
labels=[int(k) for k in levels]
)
ax.set_xlabel("Price level (# of $)")
ax.set_ylabel("Average review rating")
ax.set_title("Price vs rating (cafe-level)")
# Mean rating by price level
ax = axes[1]
ax.bar(means.index.astype(int), means.values)
ax.set_xlabel("Price level (# of $)")
ax.set_ylabel("Mean rating")
ax.set_title("Mean rating by price level")
fig.suptitle("Price", fontsize=14)
fig.tight_layout()
plt.show()
/var/folders/d9/8g_dsxvn7vngpll11tdl25j80000gq/T/ipykernel_37447/4126261889.py:34: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. ax.boxplot(
Open Hours¶
In [21]:
# Converts time text to values
def parse_time(t):
t = t.strip().upper()
# Match hh or hh:mm formats
m = re.match(r"(\d{1,2})(?::(\d{2}))?(AM|PM)", t)
if not m:
raise ValueError(f"Invalid time format: {t}")
hour = int(m.group(1))
minute = int(m.group(2) or 0)
period = m.group(3)
# Convert to 24-hour
if period == "AM":
if hour == 12:
hour = 0
else: # PM
if hour != 12:
hour += 12
return hour + minute / 60.0
# One Hot Encoding for Open Hours
def hours_to_onehot(hour_str):
if hour_str is None or hour_str is np.nan:
return [0,0,0]
before_noon = 0
after_noon = 0
hours = ast.literal_eval(hour_str)
for entry in hours:
if entry[1] == "Open 24 hours":
return [1.,0,0]
if entry[1] == "Closed":
continue
open_str, close_str = entry[1].split("–")
try:
start_hr = int(np.floor(parse_time(open_str)))
except ValueError:
return [0,0,0]
if start_hr < 13:
before_noon += 1.
else: after_noon += 1.
if before_noon > after_noon:
return [0,1.,0]
return [0,0,1.]
def open_hours_category(hour_str):
onehot = hours_to_onehot(hour_str)
if onehot == [1., 0, 0]:
return "24 hours"
elif onehot == [0, 1., 0]:
return "Opens before noon"
elif onehot == [0, 0, 1.]:
return "Opens after noon"
else:
return "Unknown/No hours"
In [22]:
cafes["open_hours_category"] = cafes["hours"].apply(open_hours_category)
# Merging with reviews to get ratings per category
df = reviews.merge(
cafes[["gmap_id", "open_hours_category"]],
on="gmap_id",
how="left"
)
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
open_hours_avg = (
df.dropna(subset=["rating", "open_hours_category"])
.groupby("open_hours_category")["rating"]
.mean()
.reindex(["24 hours", "Opens before noon", "Opens after noon", "Unknown/No hours"])
)
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(open_hours_avg.index.tolist(), open_hours_avg.values.tolist())
ax.set_xlabel("Open-hours category")
ax.set_ylabel("Average review rating")
ax.set_title("Average Rating by Open-hours Category")
plt.xticks(rotation=15)
plt.tight_layout()
plt.show()
Location¶
In [23]:
@lru_cache(maxsize=1)
def get_counties_ca():
counties = gpd.read_file("resources/cb_2018_us_county_500k.shp")
counties_ca = counties[counties["STATEFP"] == "06"] # California only
counties_ca = counties_ca.sort_values("NAME").reset_index(drop=True)
counties_ca["COUNTY_NUM"] = counties_ca.index
counties_ca = counties_ca.set_geometry("geometry")
_ = counties_ca.sindex
return counties_ca
def get_county(lat, lon):
counties_ca = get_counties_ca()
point = Point(lon, lat) # geometry expects (lon, lat)
idx = list(counties_ca.sindex.intersection(point.bounds))
if not idx:
return None
candidates = counties_ca.iloc[idx]
matches = candidates[candidates.contains(point)]
return int(matches.iloc[0]["COUNTY_NUM"]) if len(matches) else None
def location_to_onehot(location):
feature_county =[0]*58 # 58 counties in Cali
if location is not None:
feature_county[location] = 1.
return feature_county
In [24]:
# Folium Visualization:
cafes_map = cafes.dropna(subset=["latitude","longitude"]).copy()
cafes_map["avg_rating"] = pd.to_numeric(cafes_map["avg_rating"], errors="coerce")
# base map
map = folium.Map(
location=[36.5, -119.5],
zoom_start=6
)
# california outline from github
ca_geojson_url = "https://raw.githubusercontent.com/glynnbird/usstatesgeojson/master/california.geojson"
folium.GeoJson(
ca_geojson_url,
name="California outline",
style_function=lambda feature: {
"fillColor": "#ffffff",
"color": "black",
"weight": 3,
"fillOpacity": 0.05
}
).add_to(map)
# california counties from github
counties_url = "https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/california-counties.geojson"
counties_geo = requests.get(counties_url).json()
# converting counties to polygons for averages
county_polys = []
for feat in counties_geo["features"]:
county_name = feat["properties"]["name"]
poly = prep(shape(feat["geometry"]))
county_polys.append((county_name, poly))
# assigning county to each cafe based on lat/long
def find_county(lat, long):
pt = Point(long, lat)
for cname, poly in county_polys:
if poly.contains(pt):
return cname
return np.nan
cafes_map["county"] = cafes_map.apply(
lambda r: find_county(r["latitude"], r["longitude"]),
axis=1
)
# getting average rating per county
county_stats = (
cafes_map.dropna(subset=["county"]) # removes cafes without county label so they aren't computed in average
.groupby("county")
.agg(avg_rating=("avg_rating", "mean"))
.reset_index()
)
# choropleth coloring counties by average rating
folium.Choropleth(
geo_data=counties_geo,
name="Average rating per county",
data=county_stats,
columns=["county", "avg_rating"],
key_on="feature.properties.name",
fill_color="YlGnBu",
fill_opacity=0.7,
line_opacity=0.3,
nan_fill_color="white",
legend_name="Average cafe rating"
).add_to(map)
# cafe markers
cluster = MarkerCluster(name="Cafe markers").add_to(map)
for _, r in cafes_map.iterrows():
popup = (
f"{r.get('name','')}"
f"<br>Rating: {r.get('avg_rating',np.nan)}"
f"<br>Price: {r.get('price','')}"
f"<br>County: {r.get('county','')}"
)
folium.CircleMarker(
location=[r["latitude"], r["longitude"]],
radius=2.5,
color="black",
weight=0.5,
fill=True,
fill_opacity=0.7,
popup=popup
).add_to(cluster)
# Toggle panel so we can show/hide features
folium.LayerControl(collapsed=False).add_to(map)
map
Out[24]:
Make this Notebook Trusted to load map: File -> Trust Notebook